## Constructs Figure 1 from Appendix C

# Import packages
import pandas as pd
import matplotlib.pyplot as plt

# Input file locations; edit these paths to match the local copy of the replication data
selected_hearings_location = "D:/Dropbox/congressworks/drafts/measurement paper/measurement_replication/topics/expert_hearingids_with_legacy.csv"
hearing_data_location = "D:/Dropbox/congressworks/drafts/measurement paper/measurement_replication/topics/US-Legislative_congressional_hearings-21.4.csv"

# Lookup tables (pairings according to Comparative Agendas Project codebooks)

# Major topic code -> short topic label used on the figure's x-axis.
# Note that codes 11 and 22 have no entry in this mapping.
topics_dict = {
    1: "Macroeconomics",
    2: "Civil Rights",
    3: "Health",
    4: "Agriculture",
    5: "Labor",
    6: "Education",
    7: "Environment",
    8: "Energy",
    9: "Immigration",
    10: "Transportation",
    12: "Law and Crime",
    13: "Social Welfare",
    14: "Housing",
    15: "Dom. Commerce",
    16: "Defense",
    17: "Technology",
    18: "Foreign Trade",
    19: "Intl. Affairs",
    20: "Gov. Operations",
    21: "Public Lands",
    23: "Culture",
}

# Committee code -> committee name; only the two oversight committees and the
# 0 placeholder are labelled, so any other code maps to NaN when applied.
committees_dict = {
    111: "House Government Reform and Oversight Committee",
    213: "Senate Governmental Affairs Committee",
    0: "N/A",
}

# Load the full CAP hearings dataset and the csv of selected hearing ids and titles
hearing_data = pd.read_csv(hearing_data_location)
selected_hearings = pd.read_csv(selected_hearings_location)

# Comparison group: every hearing held by one of the two oversight committees
# (as either the first or second listed committee) between 1995 and 2020 inclusive
oversight_committee_ids = [111, 213]
condition1 = (
    hearing_data["Committee1"].isin(oversight_committee_ids)
    | hearing_data["Committee2"].isin(oversight_committee_ids)
)
condition2 = hearing_data["year"].between(1995, 2020)
oversight_hearings = hearing_data[condition1 & condition2]

# Attach CAP hearing data to the selected hearings; the inner join keeps only
# selected hearings whose legacyid matches the source id of an oversight hearing
selected_hearings = pd.merge(
    left=selected_hearings,
    right=oversight_hearings,
    how="inner",
    left_on="legacyid",
    right_on="source",
)

# Reshape both frames to a common layout: keep the needed columns, rename them,
# then add human-readable topic and committee labels from the lookup tables
oversight_hearings = (
    oversight_hearings[["source", "year", "majortopic", "Committee1", "Committee2"]]
    .rename(
        columns={
            "source": "id2",
            "majortopic": "topicid",
            "Committee1": "committee1id",
            "Committee2": "committee2id",
            "year": "year",
        }
    )
    .assign(
        topic=lambda frame: frame["topicid"].map(topics_dict),
        committee1=lambda frame: frame["committee1id"].map(committees_dict),
        committee2=lambda frame: frame["committee2id"].map(committees_dict),
    )
)

# "title_x" is the title column coming from the left (selected hearings) side of the merge
selected_hearings = (
    selected_hearings[["hearingid", "title_x", "legacyid", "year", "majortopic", "Committee1", "Committee2"]]
    .rename(
        columns={
            "hearingid": "id1",
            "legacyid": "id2",
            "title_x": "title",
            "majortopic": "topicid",
            "Committee1": "committee1id",
            "Committee2": "committee2id",
            "year": "year",
        }
    )
    .assign(
        topic=lambda frame: frame["topicid"].map(topics_dict),
        committee1=lambda frame: frame["committee1id"].map(committees_dict),
        committee2=lambda frame: frame["committee2id"].map(committees_dict),
    )
)

# Count hearings by topic and divide by the total number of hearings to get the
# topic distribution of each comparison group, expressed in percent (0-100).
# Hearings whose topic is NaN (topic code without a label) are dropped by
# groupby and are therefore excluded from both the counts and the totals.
grouped_oversight_hearings = oversight_hearings.groupby(by = "topic").size()
num_hearings_1 = grouped_oversight_hearings.sum()
print("Total oversight hearings 1995-2020: " + str(num_hearings_1))
# Scale shares to percentages so the values agree with the "%" wording used in
# the legend labels and the y-axis label of the figure
grouped_oversight_hearings = 100 * grouped_oversight_hearings / num_hearings_1
grouped_oversight_hearings = grouped_oversight_hearings.sort_values(ascending = False)

grouped_selected_hearings = selected_hearings.groupby(by = "topic").size()
num_hearings_3 = grouped_selected_hearings.sum()
print("Total selected oversight hearings: " + str(num_hearings_3))
# Same percentage scaling for the expert-scored subset
grouped_selected_hearings = 100 * grouped_selected_hearings / num_hearings_3
grouped_selected_hearings = grouped_selected_hearings.sort_values(ascending = False)

# Combine the two topic distributions into one dataframe, one column per group.
# The legend labels (including each group's n) are supplied as the column names.
all_oversight_label = "% of all oversight hearings 1995-2020 (n = {})".format(num_hearings_1)
expert_scored_label = "% of oversight hearings scored by experts (n = {})".format(num_hearings_3)
grouped_data = pd.concat(
    [grouped_oversight_hearings, grouped_selected_hearings],
    axis=1,
    keys=[all_oversight_label, expert_scored_label],
).reset_index()
# A topic that appears in only one group has no value in the other; show it as 0
grouped_data = grouped_data.fillna(0)

# Draw the two topic distributions as a grouped bar chart (two bars per topic)
ax = grouped_data.plot.bar(x="topic")

# Axis labels: topic names already identify the x-axis, so its label is left blank
ax.set_xlabel("")
ax.set_ylabel("% of Hearings with Topic", fontsize=24)

# Tick and legend text sizes; topic names are rotated to vertical
plt.yticks(fontsize=16)
plt.xticks(fontsize=16, rotation=90)
plt.legend(fontsize=16)

plt.tight_layout()
plt.show()